This is a case study used as a capstone for the Google Data Analytics certificate. Problem: My favorite activity in the world is sleeping. Could I optimize it by controlling the different variables that might affect it?
Solution: Let’s dive into a sleep study to discover which variables improve or ruin a good night’s sleep!
library(tidyverse)
library(ggplot2)
library(patchwork)
# Load the raw sleep dataset from a CSV file in the working directory.
data <- read.csv("sleepdata.csv")
# Preview the first rows to check that the columns were parsed as expected.
head(data)
## Person.ID Gender Age Occupation Sleep.Duration Quality.of.Sleep
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representative 5.9 4
## 5 5 Male 28 Sales Representative 5.9 4
## 6 6 Male 28 Software Engineer 5.9 4
## Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate
## 1 42 6 Overweight 126/83 77
## 2 60 8 Normal 125/80 75
## 3 60 8 Normal 125/80 75
## 4 30 8 Obese 140/90 85
## 5 30 8 Obese 140/90 85
## 6 30 8 Obese 140/90 85
## Daily.Steps Sleep.Disorder
## 1 4200 None
## 2 10000 None
## 3 10000 None
## 4 3000 Sleep Apnea
## 5 3000 Sleep Apnea
## 6 3000 Insomnia
# Per-column summary: quartiles and mean for numeric columns, length/class for text columns.
summary(data)
## Person.ID Gender Age Occupation
## Min. : 1.00 Length:374 Min. :27.00 Length:374
## 1st Qu.: 94.25 Class :character 1st Qu.:35.25 Class :character
## Median :187.50 Mode :character Median :43.00 Mode :character
## Mean :187.50 Mean :42.18
## 3rd Qu.:280.75 3rd Qu.:50.00
## Max. :374.00 Max. :59.00
## Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## Length:374 Length:374 Min. :65.00 Min. : 3000
## Class :character Class :character 1st Qu.:68.00 1st Qu.: 5600
## Mode :character Mode :character Median :70.00 Median : 7000
## Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep.Disorder
## Length:374
## Class :character
## Mode :character
##
##
##
# Number of missing (NA) values per column
colSums(is.na(data))
## Person.ID Gender Age
## 0 0 0
## Occupation Sleep.Duration Quality.of.Sleep
## 0 0 0
## Physical.Activity.Level Stress.Level BMI.Category
## 0 0 0
## Blood.Pressure Heart.Rate Daily.Steps
## 0 0 0
## Sleep.Disorder
## 0
# Mean age and number of respondents for each gender.
gen_avg <- summarise(
  group_by(data, Gender),
  mean_age = mean(Age),
  n = n()
)
gen_avg
## # A tibble: 2 × 3
## Gender mean_age n
## <chr> <dbl> <int>
## 1 Female 47.4 185
## 2 Male 37.1 189
The average age of women is higher than that of men. This could skew the results, since age may be an important factor in sleep quality. We do, however, have a roughly 50/50 gender split.
# Store the sleep-disorder class as a factor with a fixed level order
# (this order drives the fill colours and the legend).
data$Sleep.Disorder <- factor(
  data$Sleep.Disorder,
  levels = c("None", "Insomnia", "Sleep Apnea")
)

# Share of respondents in each sleep-disorder class, smallest share first,
# with a pre-formatted percentage label for the chart.
df <- data %>%
  group_by(Sleep.Disorder) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(
    perc = n / sum(n),
    labels = scales::percent(perc)
  ) %>%
  arrange(perc)

# Pie chart: a single stacked column wrapped into polar coordinates.
ggplot(df, aes(x = "", y = perc, fill = Sleep.Disorder)) +
  geom_col() +
  geom_label(
    aes(label = labels),
    position = position_stack(vjust = 0.5),
    color = "black",
    show.legend = FALSE
  ) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Reds") +
  ggtitle("Percentage of Sleep Disorders") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5))
The sleep classes are not equally represented, but we have an almost 60/40 split between healthy sleepers and those with a sleep disorder, respectively.
# Merge the duplicate "Normal Weight" spelling into "Normal".
data$BMI.Category <- gsub("Normal Weight", "Normal", data$BMI.Category, fixed = TRUE)

# Order the categories by increasing BMI.
# `levels =` only sets the order of the existing values. The previous `labels =`
# renamed the alphabetically sorted values (Normal, Obese, Overweight) position by
# position, which silently swapped the "Obese" and "Overweight" groups.
# NOTE(review): any conclusion drawn from the earlier, mislabelled figure about
# "Overweight" vs. "Obese" should be re-checked against this corrected plot.
data$BMI.Category <- factor(
  data$BMI.Category,
  levels = c("Normal", "Overweight", "Obese")
)

# Stacked bar chart: respondents per BMI category, split by sleep disorder.
bmi_bar <- ggplot(data, aes(x = BMI.Category, fill = Sleep.Disorder)) +
  geom_bar() +
  scale_fill_brewer(palette = "Reds") +
  ggtitle("BMI effect on Sleep") +
  theme(plot.title = element_text(hjust = 0.5))
bmi_bar
We cleaned up this column by merging the “Normal” and “Normal Weight” samples. The overweight category seems severely underrepresented.
A conclusion we can draw from this figure is that BMI is correlated with sleep quality. A higher BMI tends to indicate insomnia and sleep apnea, while the lowest BMI category has a majority of normal sleep. The Obese class has an almost 50/50 split between the two sleep disorders. We cannot draw any conclusions about the Overweight category due to the small sample size, but it does seem to indicate a correlation with sleep disorders.
# Theme shared by both occupation plots so their axes line up when placed side by side.
# For 45-degree tick labels, hjust = 1 / vjust = 1 anchors the end of each label at
# its tick; the two plots previously used different, centred justifications.
occupation_theme <- theme(
  axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
  plot.margin = margin(t = 5, r = 10, b = 5, l = 10),
  axis.title.x = element_text(margin = margin(t = 20)),
  plot.title = element_text(hjust = 0.5)
)

# Scatter plot: stress level per occupation, point size = age, fill = sleep disorder.
# This is the plot that shows age and stress, so it carries the
# "Occupation, Age and Stress" title (the two titles were previously swapped).
occup_point <- ggplot(data, aes(x = Occupation, y = Stress.Level)) +
  geom_point(
    aes(fill = Sleep.Disorder, size = Age),
    color = "black", shape = 21, stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds") +
  ggtitle("Occupation, Age and Stress effect on Sleep") +
  occupation_theme

# Stacked bar chart: respondents per occupation, split by sleep disorder.
occup_bar <- ggplot(data, aes(x = Occupation, fill = Sleep.Disorder)) +
  geom_bar() +
  scale_fill_brewer(palette = "Reds") +
  ggtitle("Occupation effect on Sleep") +
  occupation_theme

# patchwork: show both plots side by side.
occup_bar + occup_point
# Blood-pressure categories and the inclusive [lower, upper] limits of each band,
# listed in order of increasing severity (same index = same band).
BP_ranges <- c(
  "Normal", "Elevated", "Hypertension_1", "Hypertension_2", "Hypertensive_Crisis"
)
BP_systolic_limits <- list(
  c(0, 120), c(120, 130), c(130, 140), c(140, 180), c(180, 200)
)
BP_diastolic_limits <- list(
  c(0, 80), c(0, 80), c(80, 90), c(90, 120), c(120, 140)
)

# TRUE where BOTH readings fall inside band `i` (limits inclusive on both ends).
in_bp_band <- function(systolic, diastolic, i) {
  sys_lim <- BP_systolic_limits[[i]]
  dia_lim <- BP_diastolic_limits[[i]]
  between(systolic, sys_lim[1], sys_lim[2]) &
    between(diastolic, dia_lim[1], dia_lim[2])
}

# Split "systolic/diastolic" into two numeric columns and label each row with the
# first band that both readings satisfy; rows matching no band become "Unknown".
data <- data %>%
  separate(Blood.Pressure, into = c("Systolic", "Diastolic"), sep = "/") %>%
  mutate(
    Systolic = as.numeric(Systolic),
    Diastolic = as.numeric(Diastolic),
    BP_range = case_when(
      in_bp_band(Systolic, Diastolic, 1) ~ BP_ranges[1],
      in_bp_band(Systolic, Diastolic, 2) ~ BP_ranges[2],
      in_bp_band(Systolic, Diastolic, 3) ~ BP_ranges[3],
      in_bp_band(Systolic, Diastolic, 4) ~ BP_ranges[4],
      in_bp_band(Systolic, Diastolic, 5) ~ BP_ranges[5],
      TRUE ~ "Unknown" # Default case
    )
  )
# Resolve rows still labelled "Unknown" (their systolic/diastolic pair matched no
# band) from the systolic reading alone.
# This replaces a hard-coded list of 15 labels that was matched to the "Unknown"
# rows purely by position, so it broke or mislabelled rows as soon as the number
# or order of such rows changed.
# NOTE(review): expected to reproduce the earlier manual labels (systolic in
# (120, 130] -> Elevated, (130, 140] -> Hypertension_1); confirm against the data.

# Label each systolic value with the first band whose inclusive limits contain it;
# NA when the value is missing or falls outside every band.
systolic_band <- function(systolic) {
  band <- rep(NA_character_, length(systolic))
  # Walk the bands from last to first so that, on a shared boundary,
  # the earliest band wins (same precedence as the pairwise classification).
  for (i in rev(seq_along(BP_ranges))) {
    limits <- BP_systolic_limits[[i]]
    hit <- !is.na(systolic) & systolic >= limits[1] & systolic <= limits[2]
    band[hit] <- BP_ranges[i]
  }
  band
}

data <- data %>%
  mutate(
    # Rows whose systolic value fits no band keep the "Unknown" label.
    BP_range = if_else(
      !is.na(BP_range) & BP_range == "Unknown",
      coalesce(systolic_band(Systolic), BP_range),
      BP_range
    ),
    # Heart rate is used as a continuous variable below. The previous
    # factor -> character -> numeric round trip only amounted to this conversion.
    Heart.Rate = as.numeric(Heart.Rate)
  )
# One row per (BP_range, Heart.Rate, Sleep.Disorder, Age) combination together with
# the number of respondents in it; BP_range is then turned into a factor that
# follows the severity order defined in BP_ranges.
data_grouped <- data %>%
  group_by(BP_range, Heart.Rate, Sleep.Disorder, Age) %>%
  summarise(Count = n(), .groups = "drop") %>%
  mutate(BP_range = factor(BP_range, levels = BP_ranges))

# Stacked columns: respondents per blood-pressure category, coloured by heart rate.
blood_bar <- ggplot(
  data_grouped,
  aes(x = BP_range, y = Count, fill = Heart.Rate)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_viridis_c(option = "magma", direction = -1)
blood_bar
We separated the “Blood.Pressure” column into “Systolic” (the number before the slash) and “Diastolic” (the number after the slash) columns and then, depending on their combination, created a categorical blood pressure range column, “BP_range”, based on the AHA categories. We manually clean up the “Unknown” ranges by making assumptions (putting more importance on the systolic number) and assigning them a category.
# Heart rate per blood-pressure category; point fill = sleep disorder, size = age.
blood_sleep <- ggplot(data_grouped, aes(x = BP_range, y = Heart.Rate)) +
  geom_point(
    aes(fill = Sleep.Disorder, size = Age),
    shape = 21,
    color = "black",
    stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds")
blood_sleep
# Treat stress level as a discrete category so it can use the Brewer fill palette.
data$Stress.Level <- as.factor(data$Stress.Level)

# Daily steps against physical activity level (fill = stress level, size = age),
# with a dashed linear-regression trend line on top.
act_bar <- ggplot(data, aes(x = Physical.Activity.Level, y = Daily.Steps)) +
  geom_point(
    aes(fill = Stress.Level, size = Age),
    shape = 21,
    color = "black",
    stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds") +
  geom_smooth(method = lm, color = "darkred", linetype = "dashed")
act_bar
# Sleep quality against sleep duration, with points filled by sleep disorder.
sleep_q <- ggplot(data, aes(x = Sleep.Duration, y = Quality.of.Sleep)) +
  geom_point(
    aes(fill = Sleep.Disorder),
    shape = 21,
    size = 3,
    color = "black",
    stroke = 0.4
  ) +
  scale_fill_brewer(palette = "Reds")
sleep_q
# Grouped copy of the data. The plot below is built from `data`, not from this
# object; it is kept in case later code relies on it.
gend_group <- data %>%
  group_by(Gender, Sleep.Disorder, Age)

# Mean age per gender, with one dodged bar per sleep-disorder class.
# `fun` replaces the deprecated `fun.y` argument of the summary stat.
gender_bar <- ggplot(data, aes(x = Gender, y = Age, fill = Sleep.Disorder)) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge") +
  scale_fill_brewer(palette = "Reds")
gender_bar
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.